import json
import re
import statistics

# Input and output file paths.
input_file_path = "dataset/minimind/pretrain_hq.jsonl"
output_file_path = "pretrain_1.4m_s512.jsonl"

# Matches a closing </s> tag separated from the following <s> tag by
# whitespace. Compiled once because it is applied to every record.
_BOUNDARY_GAP = re.compile(r'</s>\s+<s>')


def normalize_boundaries(text):
    """Return *text* with any whitespace between ``</s>`` and ``<s>`` removed."""
    return _BOUNDARY_GAP.sub('</s><s>', text)


def process_dataset(input_path, output_path):
    """Rewrite a JSONL file with normalized tag boundaries.

    Every non-blank line of *input_path* is parsed as a JSON object; its
    ``"text"`` value is normalized with :func:`normalize_boundaries` and
    written to *output_path* as a ``{"text": ...}`` record, one per line.

    Returns:
        A list with the character length of each processed text, in input
        order. The list is empty when the input holds no records.

    Raises:
        json.JSONDecodeError: a non-blank line is not valid JSON.
        KeyError: a record has no ``"text"`` key.
    """
    lengths = []
    with open(input_path, "r", encoding="utf-8") as input_file, \
         open(output_path, "w", encoding="utf-8") as output_file:
        for line in input_file:
            # Blank lines (e.g. a trailing empty line) are not records;
            # json.loads would raise on them.
            if not line.strip():
                continue

            data = json.loads(line)
            processed_text = normalize_boundaries(data["text"])

            # ensure_ascii=False keeps non-ASCII text readable in the output.
            output_file.write(
                json.dumps({"text": processed_text}, ensure_ascii=False) + "\n"
            )
            lengths.append(len(processed_text))
    return lengths


def summarize_lengths(lengths):
    """Return ``(max, min, mean)`` of *lengths*, or ``None`` if it is empty.

    The empty case is reported as ``None`` because ``statistics.mean``
    raises ``StatisticsError`` when given no data.
    """
    if not lengths:
        return None
    return max(lengths), min(lengths), statistics.mean(lengths)


def main():
    """Process the configured dataset and print its length statistics."""
    lengths = process_dataset(input_file_path, output_file_path)
    summary = summarize_lengths(lengths)
    if summary is None:
        print("未读取到任何记录，无法统计长度。")
        return

    max_length, min_length, average_length = summary
    print(f"最大长度: {max_length}")
    print(f"最小长度: {min_length}")
    print(f"平均长度: {average_length}")


if __name__ == "__main__":
    main()